In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate, GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, cohen_kappa_score, precision_score, recall_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from lime.lime_tabular import LimeTabularExplainer
from tqdm import tqdm
import warnings
import random
In [2]:
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", message=".*X does not have valid feature names*")

Data preparation¶

In [3]:
# Load the hand-coded conceptual features; keep id + raw reflection text
# separately so the original wording can be shown alongside LIME explanations.
data = pd.read_csv('../conceptual_features.csv')
corpus = data.loc[:, ['text.id', 'text']]
In [4]:
# Recode outcome vars
# internal_uncontrollable: internal locus AND uncontrollable AND not controllable
data['internal_uncontrollable'] = np.where((data['locus.internal']==1) & 
                                           (data['controllability.uncontrollable']==1) & 
                                           (data['controllability.controllable']==0),1,0)
# no.internal: purely external locus (external coded, internal not)
data['no.internal'] = np.where((data['locus.external']==1) & (data['locus.internal']==0),1,0)

# codebook: 1-no.attribution, 2-no.internal, 3-internal.uncontrollable
# Vectorized column arithmetic replaces the original row-wise apply(lambda ...)
# (identical result, avoids a Python-level loop over 1080 rows).
# NOTE(review): the weighted sum assumes the three indicators never co-occur
# (a 1+2 row would alias with code 3) — presumably guaranteed by the coding
# scheme; confirm against the codebook.
data['attribution'] = (data['no.attribution']
                       + data['no.internal']*2
                       + data['internal_uncontrollable']*3)
# codebook (cont.): 4-internal.controllable (rows matching none of the above)
data['attribution'] = data['attribution'].replace(0,4)

data = data.drop(columns=['internal_uncontrollable', 'no.internal'])
data['attribution'].value_counts()
Out[4]:
attribution
4    688
1    253
2     93
3     46
Name: count, dtype: int64
In [5]:
# Dropping non-feature/target columns
metadata_cols = ['text.id', 'student.id', 'exam.failed', 'exam.date',
                 'year.of.exam', 'text', 'submit.date', 'year.of.submission']
excluded_codes = ['exam.preparation', 'exam.strategies', 'exam.mistakes', 'knowledge.gap']
data = data.drop(columns=metadata_cols + excluded_codes + ['text.len'])

# One-hot coded attribution dimensions (the fine-grained form of the target)
onehot = ['no.attribution',
          'locus.external', 'locus.internal',
          'stability.unstable', 'stability.stable',
          'controllability.uncontrollable', 'controllability.controllable']

# Targets of interest
targets = ['attribution']

# Dimensionality of the feature space
data.drop(columns=targets + onehot).info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080 entries, 0 to 1079
Data columns (total 39 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   focuspast               1080 non-null   float64
 1   focuspresent            1080 non-null   float64
 2   focusfuture             1080 non-null   float64
 3   correctness.mistakes    1080 non-null   int64  
 4   mastery                 1080 non-null   int64  
 5   KC.experience           1080 non-null   int64  
 6   KC.knowledge            1080 non-null   int64  
 7   KC.skill                1080 non-null   int64  
 8   clinical.concept        1080 non-null   int64  
 9   concept.hygiene         1080 non-null   int64  
 10  concept.anatomy         1080 non-null   int64  
 11  concept.pathology       1080 non-null   int64  
 12  concept.patientCare     1080 non-null   int64  
 13  concept.treatment       1080 non-null   int64  
 14  concept.prosthodontics  1080 non-null   int64  
 15  clinical.procedure      1080 non-null   int64  
 16  effort                  1080 non-null   int64  
 17  studying.materials      1080 non-null   int64  
 18  strategies              1080 non-null   int64  
 19  challenge               1080 non-null   int64  
 20  exam.delivery           1080 non-null   int64  
 21  person                  1080 non-null   int64  
 22  neg_VERB                1080 non-null   int64  
 23  pos_VERB                1080 non-null   int64  
 24  neg_ADV                 1080 non-null   int64  
 25  pos_ADV                 1080 non-null   int64  
 26  neg_ADJ                 1080 non-null   int64  
 27  pos_ADJ                 1080 non-null   int64  
 28  neg_words               1080 non-null   int64  
 29  pos_words               1080 non-null   int64  
 30  overall_sentiment       1080 non-null   float64
 31  percent.neg_VERB        1080 non-null   float64
 32  percent.pos_VERB        1080 non-null   float64
 33  percent.neg_ADV         1080 non-null   float64
 34  percent.pos_ADV         1080 non-null   float64
 35  percent.neg_ADJ         1080 non-null   float64
 36  percent.pos_ADJ         1080 non-null   float64
 37  percent.neg_words       1080 non-null   float64
 38  percent.pos_words       1080 non-null   float64
dtypes: float64(12), int64(27)
memory usage: 329.2 KB
In [6]:
random_state = 6052  # single seed reused for splitting, CV folds and model fitting

# Splitting hold out test set (10% of the data)
label_cols = targets + onehot
y = data[label_cols]
X = data.drop(columns=label_cols)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=random_state
)
In [7]:
# Randomly pick instances by class for LIME-based model interpretation

def pattern_index(ext, internal, unstable, stable, unctrl, ctrl):
    """Test-set indices whose one-hot attribution codes exactly match the pattern.

    Each argument is the required 0/1 value of the corresponding one-hot column.
    Replaces six near-identical hand-written query strings.
    """
    mask = ((y_test['locus.external'] == ext) &
            (y_test['locus.internal'] == internal) &
            (y_test['stability.unstable'] == unstable) &
            (y_test['stability.stable'] == stable) &
            (y_test['controllability.uncontrollable'] == unctrl) &
            (y_test['controllability.controllable'] == ctrl))
    return y_test.index[mask]

# (pool of candidate indices, fine-grained label, compacted label)
pools = [
    (y_test.index[y_test['no.attribution'] == 1], 'No attribution', 'No attribution'),
    (pattern_index(0, 1, 1, 0, 0, 1), 'Internal - Unstable - Controllable', 'Internal-Controllable'),
    (pattern_index(1, 0, 1, 0, 0, 1), 'External - Unstable - Controllable', 'External'),
    (pattern_index(1, 0, 0, 1, 0, 1), 'External - Stable - Controllable', 'External'),
    (pattern_index(1, 1, 1, 1, 0, 1), 'Int/Ext - Unstbl/Stbl - Controllable', 'Internal-Controllable'),
    (pattern_index(0, 1, 0, 1, 1, 0), 'Internal - Stable - Uncontrollable', 'Internal-Uncontrollable'),
]

# Seed once, then draw in a fixed order, so the same instances are picked on
# every re-run (choice order matches the original cell's sequence of calls).
random.seed(random_state)
instances = pd.DataFrame(
    [(random.choice(pool), fine, compact) for pool, fine, compact in pools],
    columns=['index', 'class_fineGrained', 'class_compacted'])

instances
Out[7]:
index class_fineGrained class_compacted
0 8 No attribution No attribution
1 581 Internal - Unstable - Controllable Internal-Controllable
2 959 External - Unstable - Controllable External
3 556 External - Stable - Controllable External
4 352 Int/Ext - Unstbl/Stbl - Controllable Internal-Controllable
5 686 Internal - Stable - Uncontrollable Internal-Uncontrollable

Model Training and Evaluation¶

Nested Cross Validation¶

Use nested 5-fold CV to estimate performance of the entire procedure¶

In [8]:
# Model training configs
forest = RandomForestClassifier(random_state=random_state)

# max_features rationale: sqrt(39) ≈ 6 ≈ 0.15*39 | (39/3 = 13) <=> (12 ≈ 0.3*39)
param_grid = {
    'class_weight': ['balanced', 'balanced_subsample'],
    'n_estimators': [200, 400, 600, 800, 1000],
    'max_features': [0.15, 0.30, 0.45, 0.60, 0.75],
}

# Metrics reported for every model (kappa has no string alias, hence make_scorer)
scoring = {
    'auc': 'roc_auc_ovo',
    'kappa': make_scorer(cohen_kappa_score),
    'precision': 'precision_macro',
    'recall': 'recall_macro',
}

# One row per target: mean and std of each metric across the outer CV folds
stat_columns = [f'{metric}_{stat}' for metric in scoring for stat in ('mean', 'std')]
results = pd.DataFrame(columns=['target'] + stat_columns)
In [9]:
# Model training and eval for each target var
for target in tqdm(targets):
    # Same fold configuration for the inner (tuning) and outer (evaluation) loops
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=random_state)
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=random_state)

    # Nested CV: GridSearchCV tunes hyperparameters on the inner folds while
    # cross_validate scores the whole tuned procedure on the outer folds
    model = GridSearchCV(estimator=forest, param_grid=param_grid, scoring='roc_auc_ovo', cv=inner_cv)
    scores = cross_validate(model, X=X_train, y=y_train[target], scoring=scoring, cv=outer_cv)

    # Aggregate per-fold scores; scoring's insertion order matches results' columns
    row = [target]
    for metric in scoring:
        fold_scores = scores[f'test_{metric}']
        row.extend([np.mean(fold_scores), np.std(fold_scores)])
    results.loc[len(results)] = row
results
100%|████████████████████████████████████████████| 1/1 [12:45<00:00, 765.77s/it]
Out[9]:
target auc_mean auc_std kappa_mean kappa_std precision_mean precision_std recall_mean recall_std
0 attribution 0.751056 0.015974 0.35132 0.072346 0.471656 0.041525 0.422468 0.025549

Evaluating and Interpreting Model on Hold Out Test Set¶

Utilities¶

In [10]:
def label(target):
    """Return the class names for `target`, in label-encoding order.

    Only one target ('attribution') exists in this notebook, so the mapping
    is constant and the argument is currently unused.
    """
    class_names = ('no attribution', 'external', 'int-unctrl', 'int-ctrl')
    return list(class_names)

def explain(instances, model, target):
    """Print each selected instance's text and show its LIME explanation.

    instances: frame with columns index / class_fineGrained / class_compacted
    model: fitted classifier exposing predict_proba
    target: target-variable name (selects training labels and class names)
    Uses the module-level X_train, X_test, y_train and corpus.
    """
    explainer = LimeTabularExplainer(training_data=np.array(X_train),
                                     feature_names=X_train.columns,
                                     training_labels=y_train[target].values,
                                     class_names=label(target),
                                     mode='classification')
    for instance in instances.itertuples():
        print(f"\n🤖: Explaining the prediction for instance {instance.index}\n")
        print(f"True label: {instance.class_compacted}")
        print(f"\nReflection: \n{corpus['text'].loc[instance.index]}\n")
        explanation = explainer.explain_instance(X_test.loc[instance.index].values,
                                                 model.predict_proba,
                                                 num_features=15,
                                                 top_labels=4)
        explanation.show_in_notebook(show_table=True)
        

Use 5-fold CV to select the best model + Performance on hold out test data¶

In [11]:
# Final model selection (5-fold CV on all training data) + hold-out evaluation
holdout_eval = pd.DataFrame(columns=['target']+list(scoring.keys()))

for target in targets:
    print(f"\n===== {target} =====\n")
    labels = label(target)
    
    # Use 5-fold CV to select the best model
    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)
    model = GridSearchCV(estimator=forest, param_grid=param_grid, scoring='roc_auc_ovo', refit=True, cv=cv)
    model.fit(X_train, y_train[target])
    print(f"Best hyperparameters: \n{model.best_params_}\n")
    print(f"Mean cross-validated auc score of the best_estimator on training set: \n{model.best_score_}\n")
    
    # Evaluate best model (refitted on the training data) on hold out test set.
    # Predict once and reuse — the original recomputed model.predict(X_test)
    # for every individual metric.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)
    print("Performance on hold out test set:")
    # > Confusion matrix
    cm = confusion_matrix(y_test[target], y_pred)
    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels).plot()
    plt.show()
    # > Classification report
    print(classification_report(y_test[target], y_pred, target_names=labels))
    # > Performance metrics (macro-averaged; AUC uses one-vs-one multiclass)
    holdout_eval.loc[len(holdout_eval)] = [target,
                                           roc_auc_score(y_test[target], y_proba, average='macro', multi_class='ovo'),
                                           cohen_kappa_score(y_test[target], y_pred),
                                           precision_score(y_test[target], y_pred, average='macro'),
                                           recall_score(y_test[target], y_pred, average='macro')]
    
    # Explain predictions using LIME
    explain(instances, model, target)
    
===== attribution =====

Best hyperparameters: 
{'class_weight': 'balanced', 'max_features': 0.75, 'n_estimators': 400}

Mean cross-validated auc score of the best_estimator on training set: 
0.7566063538388739

Performance on hold out test set:
No description has been provided for this image
                precision    recall  f1-score   support

no attribution       0.55      0.50      0.52        22
      external       0.40      0.22      0.29         9
    int-unctrl       0.43      0.33      0.38         9
      int-ctrl       0.75      0.84      0.79        68

      accuracy                           0.68       108
     macro avg       0.53      0.47      0.49       108
  weighted avg       0.65      0.68      0.66       108


🤖: Explaining the prediction for instance 8

True label: No attribution

Reflection: 
I need to know more about RPD designs as well as which are stress bearing areas and which parts serve as direct and indirect retainers.

🤖: Explaining the prediction for instance 581

True label: Internal-Controllable

Reflection: 
I needed further revie in Oral and Maxillofacial Pathology and Radiology.

🤖: Explaining the prediction for instance 959

True label: External

Reflection: 
I should have passed but there was a discrepancy where the faculty thought I placed my hand too low on the thyroid cartilage.

🤖: Explaining the prediction for instance 556

True label: External

Reflection: 
I believe this was the admissions clinic extra-oral and intra-oral examination. I failed it the first time I took it because the faculty member I was working with did not like the way I palpated the lymph nodes in the neck and submandibular region.

🤖: Explaining the prediction for instance 352

True label: Internal-Controllable

Reflection: 
I was not sure about the information asked on the exam.

🤖: Explaining the prediction for instance 686

True label: Internal-Uncontrollable

Reflection: 
I was unable to manage time properly which is why I was unable to finish the exam. I spent too much time in answering few questions in the beginning which is why I was unable to attempt questions in the end. Also while reviewing the exam I realized that how important the proper use of terminology is in describing a lesion.

In [12]:
# Summary of hold-out performance (one row per target)
holdout_eval
Out[12]:
target auc kappa precision recall
0 attribution 0.816309 0.364492 0.532143 0.473448
In [ ]: